Part1 - Classification
1 Task 1 - average monthly earnings
1.1 Choose the response (target) variable - average monthly
earnings
The selected response (target) variables is average monthly
earnings.This variable does not exist in the database, so we created it
using a specific method. We chose “average_monthly_earnings” as target
variable to classify income levels.Due to the wide range of data with a
substantial gap between the lowest and highest values, the median is
used as a substitute for the mean.The value for average monthly earnings
is represented by the median between the original database values of
“lowest_monthly_earnings” and “highest_monthly_earnings”. Due to the
right-skewed distribution of income, we selected the median as the
threshold for determining high income.The labels assigned to the
“average_monthly_earnings” variable are [0,138975) for low income levels
and [138975, Inf) for high income levels.
youtubedata <- youtubedata %>%
mutate(
average_monthly_earnings = apply(select(., c("lowest_monthly_earnings", "highest_monthly_earnings")), 1, median),
average_monthly_earnings_category = cut(
average_monthly_earnings,
breaks = c(0, 138975 , Inf),
labels = c("low", "high"),
right = FALSE
)
)
summary(youtubedata$average_monthly_earnings)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 49038 138975 325318 324150 7225450
table(youtubedata$average_monthly_earnings_category)
##
## low high
## 465 525
Looking at the proportions of outcome variable selected, we can see
that there is a fairly equal split between the two categories.
round(prop.table(table(youtubedata$average_monthly_earnings_category)), 2)
##
## low high
## 0.47 0.53
1.2 Single variable models
Preprocess data.
youtubedata1 <- youtubedata[, c("subscribers","video.views", "uploads", "channel_type", "created_year", "created_month", "created_date", "average_monthly_earnings_category")]
youtubedata1 <- youtubedata1[!is.na(youtubedata1$channel_type), ]
youtubedata1$channel_type <- as.factor(youtubedata1$channel_type)
Split the dataset into training, testing set and a validation (or
calibration) set.
d <- youtubedata1
set.seed(2333)
d$rg <- runif(dim(d)[1])
dTrainAll <- subset(d, rg<=0.7)
dTest <- subset(d, rg>0.7)
vars <- setdiff(colnames(dTrainAll), c("rg","average_monthly_earnings_category"))
catVars <- vars[sapply(dTrainAll[, vars], class) %in%
c('factor', 'character')]
numericVars <- vars[sapply(dTrainAll[, vars], class) %in%
c('numeric', 'integer')]
# split dTrainAll into a training set and a validation (or calibration) set
useForCal <- rbinom(n=dim(dTrainAll)[1], size=1, prob=0.1)>0
dCal <- subset(dTrainAll, useForCal)
dTrain <- subset(dTrainAll, !useForCal)
Build univariate models.
outcome <- 'average_monthly_earnings_category'
pos <- 'high'
mkPredC <- function(outCol, varCol, appCol) {
pPos <- sum(outCol == pos) / length(outCol)
naTab <- table(as.factor(outCol[is.na(varCol)]))
pPosWna <- (naTab/sum(naTab))[pos]
vTab <- table(as.factor(outCol), varCol)
pPosWv <- (vTab[pos, ] + 1.0e-3*pPos) / (colSums(vTab) + 1.0e-3)
pred <- pPosWv[appCol]
pred[is.na(appCol)] <- pPosWna
pred[is.na(pred)] <- pPos
pred
}
mkPredN <- function(outCol, varCol, appCol) {
cuts <- unique(
quantile(varCol, probs=seq(0, 1, 0.1), na.rm=T))
varC <- cut(varCol,cuts)
appC <- cut(appCol,cuts)
mkPredC(outCol,varC,appC)
}
calcAUC <- function(predcol,outcol) {
perf <- performance(prediction(predcol,outcol == pos),'auc')
as.numeric(perf@y.values)
}
Process all variables, including categorical and numeric
variables,and check their AUC scores.
for (v in catVars) {
pi <- paste('pred', v, sep='')
dTrain[,pi] <- mkPredC(dTrain[[outcome]], dTrain[[v]], dTrain[[v]])
dCal[,pi] <- mkPredC(dCal[[outcome]], dCal[[v]], dCal[[v]])
dTest[,pi] <- mkPredC(dTest[[outcome]], dTest[[v]], dTest[[v]])
aucTrain <- roc(dTrain[[outcome]], dTrain[[pi]])$auc
if (aucTrain >= 0.55) {
aucCal <- roc(dCal[[outcome]], dCal[[pi]])$auc
print(sprintf(
"%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
pi, aucTrain, aucCal
))
}
}
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predchannel_type: trainAUC: 0.625; calibrationAUC: 0.729"
for (v in numericVars) {
pi <- paste('pred', v, sep='')
dTrain[,pi] <- mkPredN(dTrain[[outcome]], dTrain[[v]], dTrain[[v]])
dCal[,pi] <- mkPredN(dCal[[outcome]], dCal[[v]], dCal[[v]])
dTest[,pi] <- mkPredN(dTest[[outcome]], dTest[[v]], dTest[[v]])
aucTrain <- roc(dTrain[[outcome]], dTrain[[pi]])$auc
if (aucTrain >= 0.55) {
aucCal <- roc(dCal[[outcome]], dCal[[pi]])$auc
print(sprintf(
"%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
pi, aucTrain, aucCal
))
}
}
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predsubscribers: trainAUC: 0.651; calibrationAUC: 0.751"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predvideo.views: trainAUC: 0.772; calibrationAUC: 0.853"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "preduploads: trainAUC: 0.673; calibrationAUC: 0.734"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_year: trainAUC: 0.554; calibrationAUC: 0.715"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_month: trainAUC: 0.603; calibrationAUC: 0.768"
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## Setting levels: control = low, case = high
## Setting direction: controls < cases
## [1] "predcreated_date: trainAUC: 0.559; calibrationAUC: 0.700"
It can be observed that the model’s performance on the calibration
set is generally better than on the training set, which suggests the
presence of overfitting tendencies. To further examine the AUC values,
all independent variables are subjected to 100-fold
cross-validation.
for (var in catVars) {
aucs <- rep(0, 100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
predRep <- mkPredC(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep, outcome])
}
print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "channel_type: mean: 0.585; sd: 0.069"
for (var in numericVars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "subscribers: mean: 0.626; sd: 0.064"
## [1] "video.views: mean: 0.764; sd: 0.050"
## [1] "uploads: mean: 0.636; sd: 0.069"
## [1] "created_year: mean: 0.475; sd: 0.069"
## [1] "created_month: mean: 0.526; sd: 0.076"
## [1] "created_date: mean: 0.471; sd: 0.059"
To validate the calibration set, the results indicate that the
“video.views” variable performs the best in the single-variable models,
with an average AUC value of 0.754 and a standard deviation of 0.063.
Compared to “subscribers,” “video.views” exhibits a higher average AUC
value and a smaller standard deviation. “Subscribers” also performs
relatively well, while the “created_year,” “created_month,” and
“created_date” variables show poorer performance.
Examine the double density plots.
The results from the dual-density curve plot show that, when
predicting probabilities based on “video.views,” a threshold of around
0.4 serves as a boundary. When the predicted probability is less than
0.4, YouTubers are more likely to have high earnings, while
probabilities greater than 0.4 are associated with lower earnings.
Within the predicted probability range of 0.3 to 0.5 for “subscribers,”
the distinction in earnings is not significant. For “uploads,” predicted
probabilities exceeding 0.5 do not exhibit a clear difference in
earnings, but within the range of 0.25 to 0.5, YouTubers are more likely
to have lower earnings. When the predicted probability for “subscribers”
exceeds 0.3, higher earnings are more likely. Other variables related to
the inception time, except for “created_month,” are generally not very
significant.
fig1 <- ggplot(dCal) + geom_density(aes(x = predsubscribers, color = as.factor(average_monthly_earnings_category)))
fig2 <- ggplot(dCal) + geom_density(aes(x = predvideo.views, color = as.factor(average_monthly_earnings_category)))
fig3 <- ggplot(dCal) + geom_density(aes(x = preduploads, color = as.factor(average_monthly_earnings_category)))
fig4 <- ggplot(dCal) + geom_density(aes(x = predcreated_year, color = as.factor(average_monthly_earnings_category)))
fig5 <- ggplot(dCal) + geom_density(aes(x = predcreated_month, color = as.factor(average_monthly_earnings_category)))
fig6 <- ggplot(dCal) + geom_density(aes(x = predcreated_date, color = as.factor(average_monthly_earnings_category)))
fig7 <- ggplot(dCal) + geom_density(aes(x = predchannel_type, color = as.factor(average_monthly_earnings_category)))
heights <- rep(3, 7)
widths <- rep(1, 1)
grid.arrange(fig1, fig2, fig3, fig4, fig5, fig6, fig7, ncol = 1, heights = heights, widths = widths)

Check the ROC curve.
plot_roc <- function(predcol, outcol, colour_id=2, overlaid=F) {
ROCit_obj <- rocit(score=predcol, class=outcol==pos)
par(new=overlaid)
plot(ROCit_obj, col = c(colour_id, 1),
legend = FALSE, YIndex = FALSE, values = FALSE)
}
plot_roc(dTest$predsubscribers, dTest[, outcome], colour_id = 2) #red
plot_roc(dTest$predvideo.views, dTest[, outcome], colour_id = 3, overlaid = TRUE)#green
plot_roc(dTest$preduploads, dTest[, outcome], colour_id = 4, overlaid = TRUE)#blue
plot_roc(dTest$predcreated_month, dTest[, outcome], colour_id = 5, overlaid = TRUE)#Light blue
plot_roc(dTest$predchannel_type, dTest[, outcome], colour_id = 6, overlaid = TRUE)#Magenta

Based on the area under the curve, it can be observed that
‘video.views’ performs best.
1.3 Null model and the best univariate model.
Create a null model.
Npos <- sum(dTrain[,outcome] == pos)
pred.Null <- Npos / nrow(dTrain)
cat("Proportion of outcome == 1 in dTrain:", pred.Null)
## Proportion of outcome == 1 in dTrain: 0.5439469
TP <- 0; TN <- sum(dCal[,outcome] == -1);
FP <- 0; FN <- sum(dCal[,outcome] == 1);
cat("nrow(dCal):", nrow(dCal), "TP:", TP, "TN:", TN, "FP:", FP, "FN:", FN)
## nrow(dCal): 61 TP: 0 TN: 0 FP: 0 FN: 0
(accuracy <- (TP + TN) / nrow(dCal))
## [1] 0
(precision <- TP/(TP + FP))
## [1] NaN
(recall <- TP/(TP + FN))
## [1] NaN
pred.Null <- rep(pred.Null, nrow(dCal))
(AUC <- calcAUC(pred.Null, dCal[,outcome]))
## [1] 0.5
Use log likelihood to select the best univariate model.
pos <- 'high'
# Define a function to compute log likelihood so that we can reuse it.
logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
sum(ifelse(ytrue==pos, log(ypred+epsilon), log(1-ypred-epsilon)), na.rm=T)
}
# Compute the likelihood of the Null model on the calibration
# set (for the KDD dataset from previous lecture)
outcome <- 'average_monthly_earnings_category'
logNull <- logLikelihood(
dCal[,outcome], sum(dCal[,outcome]==pos)/nrow(dCal)
)
cat(logNull)
## -42.20818
selCatVars <- c()
minDrop <- 10 # may need to adjust this number
for (v in catVars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
if (devDrop >= minDrop) {
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selCatVars <- c(selCatVars, pi)
}
}
## [1] "predchannel_type, deviance reduction: 16.2203"
selNumVars <- c()
minDrop <- 10 # may need to adjust this number
for (v in numericVars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
if (devDrop >= minDrop) {
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selNumVars <- c(selNumVars, pi)
}
}
## [1] "predsubscribers, deviance reduction: 16.4822"
## Warning in log(1 - ypred - epsilon): NaNs produced
## [1] "predvideo.views, deviance reduction: 29.3237"
## [1] "preduploads, deviance reduction: 14.6795"
## [1] "predcreated_year, deviance reduction: 10.3"
## [1] "predcreated_month, deviance reduction: 14.9858"
It’s evident that the “video.views” model is the best univariate
model.
1.4 Multivariate Model - Feature Selection.
Forming a set of “candidate” columns for predictors along with the
target column. Based on extensive literature, we can make hypotheses to
assess potential independent variables and use Chi-square Test to test
if these independent
variables(“video.views”,“channel_type”,“uploads”,“subscribers” and
“created_month”) can impact the classification of the dependent
variable. Select the appropriate feature selection method based on the
different data types of the independent variables.
# Chi-square Test
x<- youtubedata1[, c("channel_type")]
y <- youtubedata1$average_monthly_earnings_category
chi_sq<- chisq.test(x, y)
p_value <- 1 - pchisq(chi_sq$statistic, df = chi_sq$parameter)
p_value
## X-squared
## 2.052656e-07
video_views <- youtubedata1[["video.views"]]
uploads <- youtubedata1[["uploads"]]
subscribers <- youtubedata1[["subscribers"]]
earnings_category <- youtubedata1[["average_monthly_earnings_category"]]
# T-test
t_test_result1 <- t.test(video_views ~ earnings_category)
t_test_result2 <- t.test(uploads ~ earnings_category)
t_test_result3 <- t.test(subscribers ~ earnings_category)
print(t_test_result1)
##
## Welch Two Sample t-test
##
## data: video_views by earnings_category
## t = -10.397, df = 663.62, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
## -10416062115 -7106830720
## sample estimates:
## mean in group low mean in group high
## 6518629286 15280075704
print(t_test_result2)
##
## Welch Two Sample t-test
##
## data: uploads by earnings_category
## t = -4.776, df = 672.97, p-value = 2.196e-06
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
## -14335.974 -5982.593
## sample estimates:
## mean in group low mean in group high
## 4132.587 14291.870
print(t_test_result3)
##
## Welch Two Sample t-test
##
## data: subscribers by earnings_category
## t = -6.0653, df = 854.71, p-value = 1.976e-09
## alternative hypothesis: true difference in means between group low and group high is not equal to 0
## 95 percent confidence interval:
## -8806150 -4500196
## sample estimates:
## mean in group low mean in group high
## 19522418 26175591
# ANOVA
created_month <- youtubedata1[["created_month"]]
earnings_category <- youtubedata1[["average_monthly_earnings_category"]]
anova_model <- aov(created_month ~ earnings_category)
anova_result <- summary(anova_model)
print(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## earnings_category 1 1 1.002 0.084 0.772
## Residuals 961 11509 11.976
The results of feature analysis indicate that ‘channel_type,’
‘video.views,’ ‘uploads,’ and ‘subscribers’ significantly influence the
level of average monthly earnings, while ‘created_month’ does not have a
significant impact on average monthly earnings. Therefore,
‘created_month’ is excluded as a feature.
1.5 Multivariate Model - Decision tree classifier
Create a multivariate model and examine the statistical analysis
results.We will start by building a model using the best-performing
“video.views” ,“subscribers” and “channel_type” variables.
Split the data and create a multivariate module
Split the dataset into training and testing sets using the
sample.split() function. Create a multivariate model and examine the
statistical analysis results.We will start by building a model using the
best-performing “video.views” and “channel_type” variables.
set.seed(123)
split <- sample.split(youtubedata1$average_monthly_earnings_category, SplitRatio = 0.7)
training_data1 <- youtubedata1[split, ]
testing_data1 <- youtubedata1[!split, ]
# Compare the levels of the 'channel_type' variable between the training and testing data.
train_levels <- levels(training_data1$channel_type)
test_levels <- levels(testing_data1$channel_type)
if(identical(train_levels, test_levels)) {
print("The 'channel_type' variable in both the training and testing data contains the same levels.")
} else {
print("The 'channel_type' variable in both the training and testing data contains the same levels.")
}
## [1] "The 'channel_type' variable in both the training and testing data contains the same levels."
channel_type_data <- youtubedata1[,c("channel_type","video.views","average_monthly_earnings_category")]
tree_model <- rpart(average_monthly_earnings_category ~ ., data = channel_type_data)
summary(tree_model)
## Call:
## rpart(formula = average_monthly_earnings_category ~ ., data = channel_type_data)
## n= 963
##
## CP nsplit rel error xerror xstd
## 1 0.4593407 0 1.0000000 1.000000 0.03404968
## 2 0.0100000 1 0.5406593 0.556044 0.03001683
##
## Variable importance
## video.views channel_type
## 87 13
##
## Node number 1: 963 observations, complexity param=0.4593407
## predicted class=high expected loss=0.4724818 P(node) =1
## class counts: 455 508
## probabilities: 0.472 0.528
## left son=2 (433 obs) right son=3 (530 obs)
## Primary splits:
## video.views < 7041031000 to the left, improve=113.74000, (0 missing)
## channel_type splits as RLLRLRLLRRLLLL, improve= 17.76451, (0 missing)
## Surrogate splits:
## channel_type splits as RLRRRRLLRRRLRL, agree=0.617, adj=0.148, (0 split)
##
## Node number 2: 433 observations
## predicted class=low expected loss=0.2586605 P(node) =0.4496366
## class counts: 321 112
## probabilities: 0.741 0.259
##
## Node number 3: 530 observations
## predicted class=high expected loss=0.2528302 P(node) =0.5503634
## class counts: 134 396
## probabilities: 0.253 0.747
According to the summary information of the decision tree model, the
model’s complexity parameter (CP) is 0.4593, indicating a moderate level
of model complexity. The root node predicts the category as ‘high’ and
splits based on the features of the observations. The model forms two
child nodes, with the left child node tending to classify observations
as ‘low,’ while the right child node leans towards ‘high.’ Particularly,
‘video.views’ plays the most significant role in the model’s
classification decisions with an importance value of 87, followed by
‘channel_type’ (13). These results provide insights into how the model
operates and the importance of different features, serving as a starting
point for further analysis.
Evaluatethe multivariate module
Evaluate the various metrics of this model.
predictions <- predict(tree_model, testing_data1, type = "class")
length(predictions)
## [1] 289
accuracy <- mean(predictions == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.7404844
confusion_matrix <- table(predictions, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##
## predictions low high
## low 101 39
## high 36 113
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision: ", precision, "\n")
## Precision: 0.7434211
cat("Recall: ", recall, "\n")
## Recall: 0.7583893
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7508306
predictions <- predict(tree_model, testing_data1, type = "vector")
true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)
pred_obj <- prediction(predictions, true_labels)
perf <- performance(pred_obj, "tpr", "fpr")
# Calculate AUC
auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value: 0.7403237
In terms of distinguishing between high and low categories, all
values related to model evaluation are greater than 0.74. This indicates
that the model performs well with high accuracy and the ability to
effectively capture true positives while maintaining a high level of
precision. The model can be used for classifying instances into high and
low categories and demonstrates relatively good performance.
Draw ROC Curve
The results show that the ROC curve indicates a good model fit.
predictions <- predict(tree_model, testing_data1, type = "vector")
true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)
pred_obj <- prediction(predictions, true_labels)
# Create a performance object to calculate ROC
perf <- performance(pred_obj, "tpr", "fpr")
# Plot the ROC curve
plot(perf, colorize = TRUE, print.cutoffs.at = seq(0, 1, 0.1), text.adj = c(-0.2, 1.7))
legend("bottomright", legend = c("Model"), col = 1, lwd = 2)

Create a double density plot.
The results show that below a probability of 0.5, it is more likely
to predict low income, while above 0.5 in probability, it is more likely
to predict high income.
normalized_predictions <- (predictions - min(predictions)) / (max(predictions) - min(predictions))
roc_data <- data.frame(Predicted = normalized_predictions, Actual = true_labels)
# Normalize predictions to the range [0, 1]
normalized_predictions <- (predictions - min(predictions)) / (max(predictions) - min(predictions))
roc_data$Predicted <- as.numeric(normalized_predictions)
fig1 <- ggplot(roc_data) +
geom_density(aes(x = Predicted, fill = factor(Actual), color = factor(Actual)), alpha = 0.5) +
xlab("Predicted Probabilities") +
ylab("Density")
# Plot the density curves with normalized predictions
print(fig1)

Add the variable “subscribers”
Incorporating “subscribers” to see if we can still construct an
effective model.
channel_type_data_n <- youtubedata1[,c("channel_type","video.views","subscribers","average_monthly_earnings_category")]
tree_model_n <- rpart(average_monthly_earnings_category ~ ., data = channel_type_data_n)
summary(tree_model_n)
## Call:
## rpart(formula = average_monthly_earnings_category ~ ., data = channel_type_data_n)
## n= 963
##
## CP nsplit rel error xerror xstd
## 1 0.4593407 0 1.0000000 1.0000000 0.03404968
## 2 0.0100000 1 0.5406593 0.5450549 0.02982321
##
## Variable importance
## video.views subscribers channel_type
## 68 22 10
##
## Node number 1: 963 observations, complexity param=0.4593407
## predicted class=high expected loss=0.4724818 P(node) =1
## class counts: 455 508
## probabilities: 0.472 0.528
## left son=2 (433 obs) right son=3 (530 obs)
## Primary splits:
## video.views < 7041031000 to the left, improve=113.74000, (0 missing)
## subscribers < 21150000 to the left, improve= 36.33391, (0 missing)
## channel_type splits as RLLRLRLLRRLLLL, improve= 17.76451, (0 missing)
## Surrogate splits:
## subscribers < 16050000 to the left, agree=0.698, adj=0.328, (0 split)
## channel_type splits as RLRRRRLLRRRLRL, agree=0.617, adj=0.148, (0 split)
##
## Node number 2: 433 observations
## predicted class=low expected loss=0.2586605 P(node) =0.4496366
## class counts: 321 112
## probabilities: 0.741 0.259
##
## Node number 3: 530 observations
## predicted class=high expected loss=0.2528302 P(node) =0.5503634
## class counts: 134 396
## probabilities: 0.253 0.747
the model’s complexity parameter (CP) is 0.4593. The values are the
same as in the previous bivariate model, indicating a moderate level of
model complexity. ‘video.views’ plays the most significant role in the
model’s classification decisions with an importance value of 68,
followed by ‘subscribers’ (22) and ‘channel_type’ (13).
Evaluate the various metrics of this model.
predictions_n <- predict(tree_model_n, testing_data1, type = "class")
length(predictions_n)
## [1] 289
accuracy <- mean(predictions_n == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.7404844
confusion_matrix <- table(predictions_n, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##
## predictions_n low high
## low 101 39
## high 36 113
# Calculate precision, recall, and F1 score
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision: ", precision, "\n")
## Precision: 0.7434211
cat("Recall: ", recall, "\n")
## Recall: 0.7583893
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7508306
predictions <- predict(tree_model_n, testing_data1, type = "vector")
true_labels <- ifelse(testing_data1$average_monthly_earnings_category == "high", 1, 0)
pred_obj <- prediction(predictions, true_labels)
# Create a performance object to calculate ROC
perf <- performance(pred_obj, "tpr", "fpr")
# Calculate AUC
auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value: 0.7403237
The consistent results across different values suggest that the
relationship between the “subscribers” variable and the target variable
is not sufficiently strong to significantly improve the model’s
performance.
1.6 Multivariate Model - K-Nearest Neighbours
Continue to use “video.views” ,“uploads” and “channel_type” as
predictor variables.
k <- 15
features <- c("subscribers", "video.views","uploads")
# Train the k-NN model
knn_model <- knn(train = training_data1[, features], test = testing_data1[, features], cl = training_data1$average_monthly_earnings_category, k = k)
accuracy <- mean(knn_model == testing_data1$average_monthly_earnings_category)
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.733564
confusion_matrix <- table(knn_model, testing_data1$average_monthly_earnings_category)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(confusion_matrix)
##
## knn_model low high
## low 102 42
## high 35 110
TP <- confusion_matrix[2, 2]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("Precision: ", precision, "\n")
## Precision: 0.7236842
cat("Recall: ", recall, "\n")
## Recall: 0.7586207
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7407407
predictions <- as.numeric(knn_model == "high") # Assuming "high" is the positive class
true_labels <- as.numeric(testing_data1$average_monthly_earnings_category == "high")
pred_obj <- prediction(predictions, true_labels)
perf <- performance(pred_obj, "tpr", "fpr")
auc_value <- performance(pred_obj, "auc")
cat("AUC Value: ", auc_value@y.values[[1]], "\n")
## AUC Value: 0.7341049
In summary, the K-Nearest Neighbors (KNN) model performs well in the
classification task, demonstrating high accuracy and reasonable
precision and recall.
Draw the ROC Curve
The results show that the ROC curve indicates a good model fit.
pred_obj <- prediction(predictions, true_labels)
perf <- performance(pred_obj, "tpr", "fpr")
plot(perf, colorize = TRUE)
legend("bottomright", legend = c("Model"), col = 1, lwd = 2)

Plot the density curves
fig1 <- ggplot(roc_data) +
geom_density(aes(x = Predicted, fill = factor(Actual), color = factor(Actual)), alpha = 0.5) +
xlab("Predicted Probabilities") +
ylab("Density") +
scale_fill_manual(values = c("blue", "red")) +
scale_color_manual(values = c("blue", "red")) +
theme_minimal()
# Plot the density curves
print(fig1)

The results displayed in this graph are similar to the results of the
decision tree dual-density plot. The results show that below a
probability of 0.5, it is more likely to predict low income, while above
0.5 in probability, it is more likely to predict high income.
1.7 Model Comparison and Evaluation.
Based on the comparison of accuracy value, prediction value, recall
value, F1 score, and AUC value, it can be observed that the decision
tree model fits better with respective values of 0.740, 0.743, 0.758,
0.750, and 0.740. In contrast, the K-Nearest Neighbors model has values
of 0.734, 0.724, 0.759, 0.740, and 0.734, slightly lower than the
decision tree. However, overall, both models exhibit similar fitting
performance, and both perform well.
1.8 Using LIME to explain XGBoost model
Create a XGBoost model and check the values related to model
evaluation and use LIME to explain the model.
youtubedata1$channel_type <- as.factor(youtubedata1$channel_type)
youtubedata1$channel_type <- as.numeric(youtubedata1$channel_type)
# Set seed for reproducibility
set.seed(123)
split <- sample.split(youtubedata1$average_monthly_earnings_category, SplitRatio = 0.7)
training_data1 <- youtubedata1[split, ]
testing_data1 <- youtubedata1[!split, ]
features <- c("channel_type", "video.views",'subscribers')
target_variable <- "average_monthly_earnings_category"
train_matrix <- as.matrix(training_data1[, features])
test_matrix <- as.matrix(testing_data1[, features])
label_vector <- ifelse(training_data1$average_monthly_earnings_category == "high", 1, 0)
cat("Data type of variable_matrix: ", class(train_matrix), "\n")
## Data type of variable_matrix: matrix array
cat("Data type of labelvec: ", class(label_vector), "\n")
## Data type of labelvec: numeric
fit_iris_example = function(variable_matrix, labelvec) {
cv <- xgb.cv(variable_matrix, label = labelvec,
params=list(
objective="binary:logistic"
),
nfold=5,
nrounds=100,
print_every_n=10,
metrics="logloss")
evalframe <- as.data.frame(cv$evaluation_log)
NROUNDS <- which.min(evalframe$test_logloss_mean)
model <- xgboost(data=variable_matrix, label=labelvec,
params=list(
objective="binary:logistic"
),
nrounds=NROUNDS,
verbose=FALSE)
model
}
model <- fit_iris_example(train_matrix, label_vector)
## [1] train-logloss:0.597899+0.003012 test-logloss:0.634807+0.009348
## [11] train-logloss:0.355487+0.016714 test-logloss:0.611853+0.052903
## [21] train-logloss:0.299579+0.012043 test-logloss:0.643771+0.065567
## [31] train-logloss:0.259564+0.015731 test-logloss:0.671163+0.083254
## [41] train-logloss:0.225813+0.012342 test-logloss:0.691175+0.094234
## [51] train-logloss:0.199241+0.013845 test-logloss:0.716176+0.097553
## [61] train-logloss:0.179809+0.009452 test-logloss:0.735592+0.099557
## [71] train-logloss:0.168270+0.008377 test-logloss:0.748314+0.104175
## [81] train-logloss:0.155626+0.008458 test-logloss:0.768456+0.107509
## [91] train-logloss:0.139870+0.007076 test-logloss:0.783039+0.112420
## [100] train-logloss:0.128259+0.005499 test-logloss:0.800747+0.112117
predictions <- predict(model, test_matrix)
accuracy <- mean(predictions > 0.5) # Assuming threshold of 0.5
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.5432526
confusion_matrix <- table(predictions = (predictions > 0.5), actual = testing_data1$average_monthly_earnings_category == "high")
confusion_matrix <- confusionMatrix(confusion_matrix)
precision <- confusion_matrix$byClass["Pos Pred Value"]
recall <- confusion_matrix$byClass["Sensitivity"]
f1_score <- confusion_matrix$byClass["F1"]
cat("Precision: ", precision, "\n")
## Precision: 0.719697
cat("Recall: ", recall, "\n")
## Recall: 0.6934307
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.7063197
labels <- testing_data1$average_monthly_earnings_category == "high" # Replace with the correct label condition
predictions_obj <- prediction(predictions, labels)
perf <- performance(predictions_obj, "tpr", "fpr")
auc_value <- performance(predictions_obj, "auc")
cat("AUC Value: ", as.numeric(auc_value@y.values), "\n")
## AUC Value: 0.7434691
explainer <- lime(training_data1[features], model = model,
bin_continuous = TRUE, n_bins = 10)
cases <- c(3,11,21,30)
(example <- testing_data1[cases,features])
## channel_type video.views subscribers
## 5 5 148000000000 159000000
## 28 9 29533230328 61000000
## 56 5 7828610828 44700000
## 93 5 18208196857 37600000
explanation <- lime::explain(example, explainer, n_labels = 1, n_features = 4)
plot_features(explanation)

LIME has generated a series of plots, among which the feature weight
plot shows that “video.views” has the highest weight, contributing the
most to the model’s predictions, followed by “subscribers” and
“channel_type”. The case probability plot reveals that when the case is
28, the probability is highest, reaching 0.88. However, as the case
number increases beyond 28, the probability decreases. Support plots and
contrast plots demonstrate that only the “channel_type” variable
exhibits a contrasting effect at the case of 28, while the others
provide support.
2 Task 2 - Unemployment.rate
2.1 Choose the response (target) variable - Unemployment.rate
The selected response (target) variables is unemployment rate . The
feature variables are
“Abbreviation”,“Urban_population”,“Population”,“Gross.tertiary.education.enrollment….”
2.2 Data Processing and Preparation
apply(is.na(youtubedataT2), 2, sum)
## rank Youtuber
## 0 0
## subscribers video.views
## 0 0
## category Title
## 46 0
## uploads Country
## 0 122
## Abbreviation channel_type
## 122 30
## video_views_rank country_rank
## 1 116
## channel_type_rank video_views_for_the_last_30_days
## 33 56
## lowest_monthly_earnings highest_monthly_earnings
## 0 0
## lowest_yearly_earnings highest_yearly_earnings
## 0 0
## subscribers_for_last_30_days created_year
## 337 5
## created_month created_date
## 5 5
## Gross.tertiary.education.enrollment.... Population
## 132 123
## Unemployment.rate Urban_population
## 123 123
## Latitude Longitude
## 123 123
Based on this result, it was found that both the country and
abbreviation columns have 122 missing values. To check if it’s possible
to combine the country and its abbreviation, let’s examine if the
locations of missing values in these two columns are the same.
count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Country", "Abbreviation")])) == 2)
print(count_suspected_na_rows)
## [1] 122
As obtained in the previous analysis, the missing values in the
columns of Population, Unemployment Rate, and Urban Population are also
the same. To verify if these missing values correspond to the same rows,
it helps determine whether they can be considered as redundant data and
be directly removed.
count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Population", "Unemployment.rate", "Urban_population")])) == 3)
print(count_suspected_na_rows)
## [1] 123
The results show that the missing positions are the same and can be
deleted together.
Examine the relationship between country and the missing values in
the unemployment rate.
unemployment_na_rows <- which(is.na(youtubedataT2$Unemployment.rate))
country_na_rows <- which(is.na(youtubedataT2$Country))
common_na_rows <- intersect(unemployment_na_rows, country_na_rows)
different_na_rows <- setdiff(union(unemployment_na_rows, country_na_rows), common_na_rows)
cat("Same rows with NA values:", length(common_na_rows), "\n")
## Same rows with NA values: 122
cat("Same rows with NA values:", length(different_na_rows), "\n")
## Same rows with NA values: 1
Here it was found that one row is different; let’s check the
situation for this particular row.
different_row_number <- different_na_rows[1]
different_row_data <- youtubedataT2[different_row_number, c("Country", "Population", "Unemployment.rate", "Urban_population")]
print(different_row_data)
## Country Population Unemployment.rate Urban_population
## 664 Andorra NA NA NA
This row is invalid. Therefore, we will construct a new data frame by
removing all rows with NA values in the Unemployment Rate column.
datanew <- youtubedataT2 %>%
select(Country, Population, Gross.tertiary.education.enrollment...., Unemployment.rate, Urban_population) %>%
filter(!is.na(Unemployment.rate))
summary(datanew)
## Country Population Gross.tertiary.education.enrollment....
## Length:872 Min. :2.025e+05 Min. : 7.60
## Class :character 1st Qu.:8.336e+07 1st Qu.:36.30
## Mode :character Median :3.282e+08 Median :68.00
## Mean :4.304e+08 Mean :63.11
## 3rd Qu.:3.282e+08 3rd Qu.:88.20
## Max. :1.398e+09 Max. :94.30
## NA's :9
## Unemployment.rate Urban_population
## Min. : 0.750 Min. : 35588
## 1st Qu.: 5.270 1st Qu.: 55908316
## Median : 9.365 Median :270663028
## Mean : 9.279 Mean :224214982
## 3rd Qu.:14.700 3rd Qu.:270663028
## Max. :14.720 Max. :842933962
##
We have identified 9 observations with missing values in the
Education Rate column. Let’s print them:
na_rows <- datanew[is.na(datanew$Gross.tertiary.education.enrollment....), ]
num_na_rows <- which(is.na(datanew$Gross.tertiary.education.enrollment....))
print(na_rows)
## Country Population Gross.tertiary.education.enrollment....
## 162 Australia 25766605 NA
## 208 Australia 25766605 NA
## 319 Australia 25766605 NA
## 407 Australia 25766605 NA
## 434 Australia 25766605 NA
## 447 Australia 25766605 NA
## 450 Australia 25766605 NA
## 611 Australia 25766605 NA
## 682 Australia 25766605 NA
## Unemployment.rate Urban_population
## 162 5.27 21844756
## 208 5.27 21844756
## 319 5.27 21844756
## 407 5.27 21844756
## 434 5.27 21844756
## 447 5.27 21844756
## 450 5.27 21844756
## 611 5.27 21844756
## 682 5.27 21844756
All of them are from Australia, which we can consider as systematic
missing values. We will fill them with the median value.
median_GTEE <- median(datanew$Gross.tertiary.education.enrollment...., na.rm = TRUE)
datanew$Gross.tertiary.education.enrollment....[is.na(datanew$Gross.tertiary.education.enrollment....)] <- median_GTEE
Although there are many data points in the table, for the same
country, the values of tertiary_education, population, and
urban_population are the same. We will create a new data frame that
includes only unique rows.
data <- unique(datanew)
summary(data)
## Country Population Gross.tertiary.education.enrollment....
## Length:48 Min. :2.025e+05 Min. : 7.60
## Class :character 1st Qu.:1.583e+07 1st Qu.:35.80
## Mode :character Median :4.185e+07 Median :57.45
## Mean :1.200e+08 Mean :55.17
## 3rd Qu.:9.744e+07 3rd Qu.:72.85
## Max. :1.398e+09 Max. :94.30
## Unemployment.rate Urban_population
## Min. : 0.750 Min. : 35588
## 1st Qu.: 3.743 1st Qu.: 9651217
## Median : 5.315 Median : 30732090
## Mean : 6.507 Mean : 69730921
## 3rd Qu.: 9.193 3rd Qu.: 57178091
## Max. :14.720 Max. :842933962
str(data)
## 'data.frame': 48 obs. of 5 variables:
## $ Country : chr "India" "United States" "Japan" "Russia" ...
## $ Population : num 1.37e+09 3.28e+08 1.26e+08 1.44e+08 5.17e+07 ...
## $ Gross.tertiary.education.enrollment....: num 28.1 88.2 63.2 81.9 94.3 60 68.9 51.3 90 88.5 ...
## $ Unemployment.rate : num 5.36 14.7 2.29 4.59 4.15 ...
## $ Urban_population : num 4.71e+08 2.71e+08 1.16e+08 1.08e+08 4.21e+07 ...
df <- data
At this point, we have completed the initial data cleaning, removed
missing values and outliers, and obtained a cleaned dataset ready for
machine learning.
2.3 Single variable models
Binary Classification of the Target Variable
Based on the data characteristics, we have chosen the mean value as
the classification threshold.
mean_unemployment_rate <- mean(data$Unemployment.rate)
data$Unemployment <- ifelse(data$Unemployment.rate > mean_unemployment_rate, "1", "-1")
data <- data %>% select(-Unemployment.rate)
print(table(data$Unemployment))
##
## -1 1
## 28 20
summary(data)
## Country Population Gross.tertiary.education.enrollment....
## Length:48 Min. :2.025e+05 Min. : 7.60
## Class :character 1st Qu.:1.583e+07 1st Qu.:35.80
## Mode :character Median :4.185e+07 Median :57.45
## Mean :1.200e+08 Mean :55.17
## 3rd Qu.:9.744e+07 3rd Qu.:72.85
## Max. :1.398e+09 Max. :94.30
## Urban_population Unemployment
## Min. : 35588 Length:48
## 1st Qu.: 9651217 Class :character
## Median : 30732090 Mode :character
## Mean : 69730921
## 3rd Qu.: 57178091
## Max. :842933962
Splitting the Dataset
Since the dataset is relatively small, we choose to split the data
into a training set and a test set, while also setting aside a
calibration set for better model tuning.
d <- data %>%
select(-Country)
set.seed(2333)
d$rg <- runif(dim(d)[1])
dTrainAll <- subset(d, rg<=0.7)
dTest <- subset(d, rg>0.7)
vars <- setdiff(colnames(dTrainAll), c('rg', 'Unemployment'))
# split dTrainAll into a training set and a validation (or calibration) set
useForCal <- rbinom(n=dim(dTrainAll)[1], size=1, prob=0.1)>0
dCal <- subset(dTrainAll, useForCal)
dTrain <- subset(dTrainAll, !useForCal)
Print the three subsets and check if each subset contains values for
the target variable as ‘1’ and ‘-1’:
table(dTrain[, 'Unemployment'])
##
## -1 1
## 15 12
table(dCal[, 'Unemployment'])
##
## -1 1
## 3 2
table(dTest[, 'Unemployment'])
##
## -1 1
## 10 6
Build univariate models for numeric variables.
As mentioned in the lecture, we construct the mkPredC function and
mkPredN function for numeric variables.
outcome <- 'Unemployment'
pos <- '1'
mkPredC <- function(outCol, varCol, appCol) {
pPos <- sum(outCol == pos) / length(outCol)
naTab <- table(as.factor(outCol[is.na(varCol)]))
pPosWna <- (naTab/sum(naTab))[pos]
vTab <- table(as.factor(outCol), varCol)
#print(vTab)
pPosWv <- (vTab[pos, ] + 1.0e-3*pPos) / (colSums(vTab) + 1.0e-3)
pred <- pPosWv[appCol]
pred[is.na(appCol)] <- pPosWna
pred[is.na(pred)] <- pPos
pred
}
mkPredN <- function(outCol, varCol, appCol) {
# compute the cuts
cuts <- unique(
quantile(varCol, probs=seq(0, 1, 0.1), na.rm=T))
# discretize the numerical columns
varC <- cut(varCol,cuts)
appC <- cut(appCol,cuts)
mkPredC(outCol,varC,appC)
}
calcAUC <- function(predcol,outcol) {
perf <- performance(prediction(predcol,outcol == pos),'auc')
as.numeric(perf@y.values)
}
Process all variables.
for(v in vars) {
pi <- paste('pred', v, sep='')
dTrain[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dTrain[,v])
dCal[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dCal[,v])
dTest[,pi] <- mkPredN(dTrain[,outcome], dTrain[,v], dTest[,v])
aucTrain <- calcAUC(dTrain[,pi], dTrain[,outcome])
if(aucTrain >= 0.55) {
aucCal <- calcAUC(dCal[,pi], dCal[,outcome])
print(sprintf(
"%s: trainAUC: %4.3f; calibrationAUC: %4.3f",
pi, aucTrain, aucCal))
}
}
## [1] "predPopulation: trainAUC: 0.872; calibrationAUC: 0.667"
## [1] "predGross.tertiary.education.enrollment....: trainAUC: 0.864; calibrationAUC: 1.000"
## [1] "predUrban_population: trainAUC: 0.828; calibrationAUC: 0.500"
Univariate model evaluation.
Further check the AUC values using 100-fold cross-validation. Since
the dataset is small, there is a risk that random splits may result in
one group becoming a single classification problem, so we include a loop
condition to ensure that each group is always binary.
for (var in vars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
# Check if the training subset contains only one class
while (length(unique(dTrainAll[useForCalRep, outcome])) < 2) {
useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
}
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("%s: mean: %4.3f; sd: %4.3f", var, mean(aucs), sd(aucs)))
}
## [1] "Population: mean: 0.615; sd: 0.333"
## [1] "Gross.tertiary.education.enrollment....: mean: 0.683; sd: 0.322"
## [1] "Urban_population: mean: 0.391; sd: 0.294"
Based on the results, the “Population” variable has an average AUC
value of 0.615 with a standard deviation of 0.333. This means that
models built using the “Population” variable have average performance
and may exhibit significant fluctuations in performance. However, the
average AUC value of this variable is the closest to the original
calibration set results.
The “Gross.tertiary.education.enrollment….” variable has an average
AUC value of 0.683 with a standard deviation of 0.322. This indicates
that models constructed using the
“Gross.tertiary.education.enrollment….” variable have relatively high
average performance, but there is still considerable variability in
performance across different cross-validations. In the previous
analysis, we observed that this variable performed better on the
calibration set than on the training set, but in this analysis, we see
that the average AUC value is still lower than that of the training set.
The seemingly perfect results are due to the high variability of random
effects. Nevertheless, the original AUC value is already close to the
maximum, indicating that the model’s performance is quite
commendable.
“Urban_population” has an average AUC value of 0.391 with a standard
deviation of 0.294. Models built using this variable exhibit lower
average performance, and performance fluctuates relatively more.
Overall, the “Gross.tertiary.education.enrollment….” variable appears
to perform the best among these univariate models, with the highest
average AUC value. The “Urban_population” variable has relatively lower
performance.
Double density plot:
fig1 <- ggplot(dCal) + geom_density(aes(x=predPopulation, color=as.factor(Unemployment)))
fig2 <- ggplot(dCal) + geom_density(aes(x=predGross.tertiary.education.enrollment...., color=as.factor(Unemployment)))
fig3 <- ggplot(dCal) + geom_density(aes(x=predUrban_population, color=as.factor(Unemployment)))
library(gridExtra)
heights <- c(3, 3, 3)
widths <- c(3)
grid.arrange(fig1, fig2, fig3, ncol = 1, heights = heights, widths = widths)

The analysis shows that for all three variables, the heights of the
negative curves are significantly higher than the positive curves in
most regions. This may suggest that the model is more accurate in
predicting the negative category within these value ranges.
Additionally, we can observe that when the predicted values of the
variables are relatively high, the accuracy of predicting the positive
variable tends to be better. This trend is present for all three
variables. The “Population” variable exhibits more overlap between the
positive and negative curves, indicating that it might be challenging to
distinguish between these two categories within these value ranges.
Overall, the double density plots show that the performance of all three
variables is quite similar.
Finally, let’s take a look at the ROC curves for the test set:
plot_roc <- function(predcol, outcol, colour_id=2, overlaid=F) {
ROCit_obj <- rocit(score=predcol, class=outcol==pos)
par(new=overlaid)
plot(ROCit_obj, col = c(colour_id, 1),
legend = FALSE, YIndex = FALSE, values = FALSE)
}
calcAUC(dTrain[,"predPopulation"], dTrain[,outcome]);calcAUC(dCal[,"predPopulation"], dCal[,outcome]);calcAUC(dTest[,"predPopulation"], dTest[,outcome]);
## [1] 0.8722222
## [1] 0.6666667
## [1] 0.4166667
calcAUC(dTrain[,"predGross.tertiary.education.enrollment...."], dTrain[,outcome]);calcAUC(dCal[,"predGross.tertiary.education.enrollment...."], dCal[,outcome]);calcAUC(dTest[,"predGross.tertiary.education.enrollment...."], dTest[,outcome]);
## [1] 0.8638889
## [1] 1
## [1] 0.4166667
calcAUC(dTrain[,"predUrban_population"], dTrain[,outcome]);calcAUC(dCal[,"predUrban_population"], dCal[,outcome]);calcAUC(dTest[,"predUrban_population"], dTest[,outcome]);
## [1] 0.8277778
## [1] 0.5
## [1] 0.625
plot_roc(dTest$predPopulation, dTest[,outcome]) #red
plot_roc(dTest$predGross.tertiary.education.enrollment...., dTest[,outcome], colour_id=3, overlaid=T) #green
plot_roc(dTest$predUrban_population, dTest[,outcome], colour_id=4, overlaid=T) #blue

The ROC curve for the “Urban_population” variable clearly shows the
best performance, with the largest area under the curve (AUC) in the
test set, while the other two variables have similar AUC values.
Therefore, in the overall evaluation, “Urban_population” performs
best on the test set, but it exhibits poorer performance in
cross-validation. On the other hand,
“Gross.tertiary.education.enrollment….” performs best in
cross-validation, and its double density plots and ROC curves are also
decent. Thus, we can conclude that
“Gross.tertiary.education.enrollment….” is the best single-variable
model.
2.4 Null Model Analysis
###Construction and Evaluation of Null Models
Npos <- sum(dTrain[,outcome] == pos)
pred.Null <- Npos / nrow(dTrain)
cat("Proportion of outcome == 1 in dTrain:", pred.Null)
## Proportion of outcome == 1 in dTrain: 0.4444444
Because pred.Null is 0.4444444, we set the threshold to 0.5.
Therefore, the model will predict all as ‘negative.’ Evaluate the
validation set and calculate the relevant evaluation metrics.
TP <- 0; TN <- sum(dCal[,outcome] == -1);
FP <- 0; FN <- sum(dCal[,outcome] == 1);
cat("nrow(dCal):", nrow(dCal), "TP:", TP, "TN:", TN, "FP:", FP, "FN:", FN)
## nrow(dCal): 5 TP: 0 TN: 3 FP: 0 FN: 2
(accuracy <- (TP + TN) / nrow(dCal))
## [1] 0.6
(precision <- TP/(TP + FP))
## [1] NaN
(recall <- TP/(TP + FN))
## [1] 0
pred.Null <- rep(pred.Null, nrow(dCal))
(AUC <- calcAUC(pred.Null, dCal[,outcome]))
## [1] 0.5
After calculating, the AUC value of the null model on the validation
set is 0.5. In contrast, the AUC values of our univariate model on the
validation set are all greater than or equal to 0.5. This demonstrates
that the performance of the univariate model is superior to the null
model. Subsequently, we can use these two models to evaluate other
complex models.
2.5 Multivariate Model-Feature Selection
We use log-likelihood to further perform feature selection.
logLikelihood <- function(ytrue, ypred, epsilon=1e-6) {
sum(ifelse(ytrue==pos, log(ypred+epsilon), log(1-ypred-epsilon)), na.rm=T)
}
Following the calculation method of log-likelihood, we first perform
the calculation for the null model.
logNull <- logLikelihood(
dCal[,outcome], sum(dCal[,outcome]==pos)/nrow(dCal)
)
cat(logNull)
## -3.365058
Next, we iterate through all numerical variables and select variables
based on the extent to which they reduce bias relative to zero.
selVars <- c()
for (v in vars) {
pi <- paste('pred', v, sep='')
devDrop <- 2*(logLikelihood(dCal[,outcome], dCal[,pi]) - logNull)
print(sprintf("%s, deviance reduction: %g", pi, devDrop))
selVars <- c(selVars, pi)
}
## [1] "predPopulation, deviance reduction: 0.1391"
## [1] "predGross.tertiary.education.enrollment...., deviance reduction: 3.48528"
## [1] "predUrban_population, deviance reduction: -16.6714"
Based on this result, we can see that the
“predGross.tertiary.education.enrollment….” variable has the highest
deviance reduction value, indicating that adding this numerical variable
will have a more significant positive impact on improving the model’s
fit and reducing bias.
On the other hand, the “predUrban_population” variable has a negative
deviance reduction value, suggesting that including this numerical
variable would worsen the model’s fit, increasing bias. This variable
does not provide strong predictive information for the target
variable.
Therefore, we are considering using “predPopulation” and
“predGross.tertiary.education.enrollment….” as feature variables for
multivariate analysis and experimenting with the inclusion of the
“predUrban_population” variable in different models to observe its
impact on the models.
2.6 Multivariate Model - Decision tree classifier
Create a multivariate model and examine the statistical analysis
results.We will start by building a model using the best-performing
“predPopulation”和”predGross.tertiary.education.enrollment….”
variables.
First, we use the two high-performing variables to build a decision
tree model called “t2model” and output the AUC value.
(fV <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment...."), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment...."
t2model <- rpart(fV, data=dTrain)
print(calcAUC(predict(t2model, newdata=dTrain), dTrain[,outcome]))
## [1] 0.7833333
print(calcAUC(predict(t2model, newdata=dCal), dCal[,outcome]))
## [1] 1
print(calcAUC(predict(t2model, newdata=dTest), dTest[,outcome]))
## [1] 0.4666667
We noticed that the performance of this model is not better than the
univariate model. However, achieving a perfect score of 1 on the
validation set is a positive result that we are happy to see.
Next, we will calculate performance metrics such as accuracy,
precision, recall, and F1 score to evaluate the model.
To begin, let’s create a function to calculate these results for
repeated use:
performanceMeasures <- function(ytrue, ypred, model.name = "model", threshold=0.5) {
# compute the normalised deviance
dev.norm <- -2 * logLikelihood(ytrue, ypred)/length(ypred)
# compute the confusion matrix
cmat <- table(actual = ytrue, predicted = ypred >= threshold)
accuracy <- sum(diag(cmat)) / sum(cmat)
precision <- cmat[2, 2] / sum(cmat[, 2])
recall <- cmat[2, 2] / sum(cmat[2, ])
f1 <- 2 * precision * recall / (precision + recall)
data.frame(model = model.name, precision = precision,
recall = recall, f1 = f1, dev.norm = dev.norm)
}
For more visually appealing output, we can define a function to set
up Pander formatting:
panderOpt <- function(){
library(pander)
# setting up Pander Options
panderOptions("plain.ascii", TRUE)
panderOptions("keep.trailing.zeros", TRUE)
panderOptions("table.style", "simple")
}
Let’s create a function for tabular output:
t_pretty_perf_table <- function(model, xtrain, ytrain, xcal, ycal,
xtest, ytest, threshold=0.5) {
# Option setting for Pander
panderOpt()
perf_justify <- "lrrrr"
# call the predict() function to do the predictions
pred_train <- predict(model, newdata=xtrain)
pred_cal <- predict(model, newdata=xcal)
pred_test <- predict(model, newdata=xtest)
# comparing performance on training, calibration,test
trainperf_df <- performanceMeasures(
ytrain, pred_train, model.name="training", threshold=threshold)
calperf_df <- performanceMeasures(
ycal, pred_cal, model.name="calibration", threshold=threshold)
testperf_df <- performanceMeasures(
ytest, pred_test, model.name="test", threshold=threshold)
# combine the three performance data frames using rbind()
perftable <- rbind(trainperf_df, calperf_df, testperf_df)
pandoc.table(perftable, justify = perf_justify)
}
Call the t_pretty_perf_table() function to output the table:
t_pretty_perf_table(t2model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 0.7143 0.8333 0.7692 1.460
## calibration 1.0000 1.0000 1.0000 1.203
## test 0.3333 0.3333 0.3333 1.148
Add the variable “predUrban_population”
We will now include the “predUrban_population” variable to build a
decision tree model. Let’s see how adding “predUrban_population” affects
the model’s performance:
(fV <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population"), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment.... + predUrban_population"
t3model <- rpart(fV, data=dTrain)
print(calcAUC(predict(t3model, newdata=dTrain), dTrain[,outcome]))
## [1] 0.7833333
print(calcAUC(predict(t3model, newdata=dCal), dCal[,outcome]))
## [1] 1
print(calcAUC(predict(t3model, newdata=dTest), dTest[,outcome]))
## [1] 0.4666667
Calculate the relevant evaluation metrics:
t_pretty_perf_table(t3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 0.7143 0.8333 0.7692 1.460
## calibration 1.0000 1.0000 1.0000 1.203
## test 0.3333 0.3333 0.3333 1.148
We can observe that whether it’s the AUC value or precision/recall/F1
score/normalized bias, adding the “predUrban_population” variable does
not have a significant impact. It doesn’t decrease the model’s
performance as expected, but it also doesn’t further improve it.
From the results, we can see that the decision tree model performs
relatively well on the training and validation sets but slightly worse
on the test set, indicating some overfitting.
Let’s also take a look at the ROC curve for further analysis:
plot_roc <- function(predcol1, outcol1, predcol2, outcol2){
roc_1 <- rocit(score=predcol1, class=outcol1==pos)
roc_2 <- rocit(score=predcol2, class=outcol2==pos)
plot(roc_1, col = c("blue","green"), lwd = 3,
legend = FALSE,YIndex = FALSE, values = TRUE, asp=1)
lines(roc_2$TPR ~ roc_2$FPR, lwd = 3,
col = c("red","green"), asp=1)
legend("bottomright", col = c("blue","red", "green"),
c("Test Data", "Training Data", "Null Model"), lwd = 2)
}
pred_test_roc <- predict(t3model, newdata=dTest)
pred_train_roc <- predict(t3model, newdata=dTrain)
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

We can see that the model not only suffers from overfitting, but the
AUC on the test set is even lower than the null model, indicating that
the model’s performance is not very optimistic.
To improve the model, let’s make adjustments to some of the model’s
hyperparameters. We will build a new model called “t3model2” based on
the three-parameter model.
t3model2 <- rpart(fV, data=dTrain,
control=rpart.control(cp=0.001, minsplit=2,
minbucket=2, maxdepth=5))
print(calcAUC(predict(t3model2, newdata=dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTrain[,outcome]))
## [1] 0.9666667
print(calcAUC(predict(t3model2, newdata=dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dCal[,outcome]))
## [1] 0.6666667
print(calcAUC(predict(t3model2, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTest[,outcome]))
## [1] 0.4666667
t_pretty_perf_table(t3model2, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 0.8462 0.9167 0.8800 0.4321
## calibration 0.5000 1.0000 0.6667 1.1562
## test 0.4000 0.3333 0.3636 0.2092
pred_test_roc <- predict(t3model2, newdata=dTest)
pred_train_roc <- predict(t3model2, newdata=dTrain)
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

Since your dataset is relatively small, you’ve noticed an improvement
in model performance by adjusting the minsplit and minbucket parameters
to 2. Precision and F1 on the test set have improved, and dev.norm has
decreased. However, the model still suffers from overfitting, which
might be due to the limited sample size.
Let’s take a look at the ROC curve for the model before and after the
modifications:
pred_test_old <- predict(t3model, newdata=dTest)
pred_test_new <- predict(t3model2, newdata=dTest)
plot_roc(pred_test_old, dTest[[outcome]],
pred_test_new, dTest[[outcome]])

The areas under the curves are the same.
Finally, let’s print the decision tree and create a
visualization:
print(t3model2)
## n= 27
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 27 6.6666670 0.4444444
## 2) predGross.tertiary.education.enrollment....< 0.4166713 13 1.6923080 0.1538462
## 4) predGross.tertiary.education.enrollment....< 0.1667962 7 0.0000000 0.0000000 *
## 5) predGross.tertiary.education.enrollment....>=0.1667962 6 1.3333330 0.3333333
## 10) predPopulation< 0.1667592 3 0.0000000 0.0000000 *
## 11) predPopulation>=0.1667592 3 0.6666667 0.6666667 *
## 3) predGross.tertiary.education.enrollment....>=0.4166713 14 2.8571430 0.7142857
## 6) predUrban_population< 0.4166713 6 1.3333330 0.3333333
## 12) predPopulation< 0.4166713 4 0.7500000 0.2500000 *
## 13) predPopulation>=0.4166713 2 0.5000000 0.5000000 *
## 7) predUrban_population>=0.4166713 8 0.0000000 1.0000000 *
par(cex=1.5)
rpart.plot(t3model2)

2.7 Multivariate Model - Naive Bayes model
To establish a Naive Bayes model using the “predPopulation” and
“predGross.tertiary.education.enrollment….” variables.
Following the lab content, use the ‘naiveBayes’ function from the
‘e1071’ package to construct a model. Start with using the two variables
with better performance:
# construct the formula f as a character string
(f <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment...."), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment...."
nb2model <- naiveBayes(as.formula(f), data=dTrain)
dTrain$nbpred <- predict(nb2model, newdata=dTrain, type='raw')[,'TRUE']
dCal$nbpred <- predict(nb2model, newdata=dCal, type='raw')[,'TRUE']
dTest$nbpred <- predict(nb2model, newdata=dTest, type='raw')[,'TRUE']
cat('The AUC value for the training set is:', calcAUC(dTrain$nbpred, dTrain[,outcome]), "\n")
## The AUC value for the training set is: 0.9527778
cat('The AUC value for the calibration set is:', calcAUC(dCal$nbpred, dCal[,outcome]), "\n")
## The AUC value for the calibration set is: 1
cat('The AUC value for the test set is:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is: 0.4
To output an evaluation metrics table, we are building a new output
table function:
nb_pretty_perf_table <- function(model, xtrain, ytrain, xcal, ycal,
xtest, ytest, threshold=0.5) {
# Option setting for Pander
panderOpt()
perf_justify <- "lrrrr"
# call the predict() function to do the predictions
pred_train <- predict(model, newdata=xtrain, type='raw')[,'TRUE']
pred_cal <- predict(model, newdata=xcal, type='raw')[,'TRUE']
pred_test <- predict(model, newdata=xtest, type='raw')[,'TRUE']
# comparing performance on training, calibration,test
trainperf_df <- performanceMeasures(
ytrain, pred_train, model.name="training", threshold=threshold)
calperf_df <- performanceMeasures(
ycal, pred_cal, model.name="calibration", threshold=threshold)
testperf_df <- performanceMeasures(
ytest, pred_test, model.name="test", threshold=threshold)
# combine the three performance data frames using rbind()
perftable <- rbind(trainperf_df, calperf_df, testperf_df)
pandoc.table(perftable, justify = perf_justify)
}
Build the table:
nb_pretty_perf_table(nb2model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 0.7857 0.9167 0.8462 3.240
## calibration 0.6667 1.0000 0.8000 3.374
## test 0.2857 0.3333 0.3077 2.657
Plot the ROC curve:
pred_test_roc <- predict(nb2model, newdata=dTest, type='raw')[,'TRUE']
pred_train_roc <- predict(nb2model, newdata=dTrain, type= 'raw')[,'TRUE']
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

Based on the data results and the ROC curve above, we can observe
that even when using the Naive Bayes model, the performance on the test
set is still not as good as the null model.
Add the variable “predUrban_population”
Let’s try adding the variable “predUrban_population” to the model and
see if it improves the performance:
(f <- paste(outcome,'> 0 ~ ',
paste(c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population"), collapse=' + '),
sep=''))
## [1] "Unemployment> 0 ~ predPopulation + predGross.tertiary.education.enrollment.... + predUrban_population"
nb3model <- naiveBayes(as.formula(f), data=dTrain)
dTrain$nbpred <- predict(nb3model, newdata=dTrain, type='raw')[,'TRUE']
dCal$nbpred <- predict(nb3model, newdata=dCal, type='raw')[,'TRUE']
dTest$nbpred <- predict(nb3model, newdata=dTest, type='raw')[,'TRUE']
cat('The AUC value for the training set is:', calcAUC(dTrain$nbpred, dTrain[,outcome]), "\n")
## The AUC value for the training set is: 0.95
cat('The AUC value for the calibration set is:', calcAUC(dCal$nbpred, dCal[,outcome]), "\n")
## The AUC value for the calibration set is: 0.8333333
cat('The AUC value for the test set is:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is: 0.5416667
After adding the new variable, there is a slight improvement in the
AUC value on the test set, even though this variable was initially
considered to have a negative impact on the model. Let’s continue to
observe the evaluation metrics and the ROC curve.
nb_pretty_perf_table(nb3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 1.0 0.8333 0.9091 4.154
## calibration 0.5 0.5000 0.5000 4.621
## test 0.4 0.3333 0.3636 3.392
pred_test_roc <- predict(nb3model, newdata=dTest, type='raw')[,'TRUE']
pred_train_roc <- predict(nb3model, newdata=dTrain, type= 'raw')[,'TRUE']
plot_roc(pred_test_roc, dTest[[outcome]],
pred_train_roc, dTrain[[outcome]])

The test set precision and F1 score have both improved, and the
training set precision even reached a perfect score. However, dev.norm
has also shown some improvement. Observing the ROC curve, we can see
that the performance has indeed improved with the addition of a
variable. However, severe overfitting still exists.
Let’s examine the dual density plot:
ggplot(dCal) + geom_density(aes(x=nbpred, color=as.factor(Unemployment)))

In comparison to the previous plot, it’s evident that the dual
density plot of this model has a larger area under the positive curve
when using the negative curve as a reference. This indicates that the
model’s predictions for the negative class have improved. Similar to
before, the area under the positive curve in the high-frequency region
is larger, indicating better performance in predicting the positive
class.
However, the model still exhibits overfitting issues, which are
likely related to the small dataset size and data quality concerns.
Furthermore, we do not consider applying LIME explanations to the
above two models. This is because decision trees inherently possess high
interpretability, with each node corresponding to a criterion based on a
specific feature. Thus, the model’s prediction process can be
intuitively understood. Naive Bayes utilizes conditional probability
relationships between features for classification, and these conditional
probabilities can typically be directly estimated from the data.
Consequently, the model’s operational principles are transparent,
eliminating the need for additional explanations. These two models do
not fall into the category of “black-box” models, as their decision
processes are highly transparent, rendering LIME explanations
unnecessary.
2.8 Model Comparison and Evaluation.
Let’s begin by comparing the performance of the decision tree model
and the Naive Bayes model for the same problem, starting with a
comparison of the AUC values:
cat('The AUC value for the test set is in tree:', calcAUC(predict(t3model2, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")]), dTest[,outcome]), "\n")
## The AUC value for the test set is in tree: 0.4666667
cat('The AUC value for the test set is in nb:', calcAUC(dTest$nbpred, dTest[,outcome]), "\n")
## The AUC value for the test set is in nb: 0.5416667
It’s clear that Naive Bayes significantly outperforms the decision
tree, while the decision tree even has an AUC value below 0.5,
performing worse than the null model.
Let’s also consider the AUC values of the three univariate models for
further observation:
for (var in vars) {
aucs <- rep(0,100)
for (rep in 1:length(aucs)) {
useForCalRep <- rbinom(n=nrow(dTrainAll), size=1, prob=0.1) > 0
# Check if the training subset contains only one class
while (length(unique(dTrainAll[useForCalRep, outcome])) < 2) {
useForCalRep <- rbinom(n = nrow(dTrainAll), size = 1, prob = 0.1) > 0
}
predRep <- mkPredN(dTrainAll[!useForCalRep, outcome],
dTrainAll[!useForCalRep, var],
dTrainAll[useForCalRep, var])
aucs[rep] <- calcAUC(predRep, dTrainAll[useForCalRep,outcome])
}
print(sprintf("The AUC value for %s is: %4.3f", var, mean(aucs)))
}
## [1] "The AUC value for Population is: 0.616"
## [1] "The AUC value for Gross.tertiary.education.enrollment.... is: 0.737"
## [1] "The AUC value for Urban_population is: 0.414"
From these results, it’s evident that the performance of the two
multivariate models is not better than that of the two best-performing
univariate models.Next, let’s compare the evaluation metrics for
different models:
Naive Bayes:
nb_pretty_perf_table(nb3model, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 1.0 0.8333 0.9091 4.154
## calibration 0.5 0.5000 0.5000 4.621
## test 0.4 0.3333 0.3636 3.392
Decision tree:
t_pretty_perf_table(t3model2, dTrain[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTrain[,outcome]==pos, dCal[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dCal[,outcome]==pos, dTest[c("predPopulation", "predGross.tertiary.education.enrollment....", "predUrban_population")], dTest[,outcome]==pos)
##
##
## model precision recall f1 dev.norm
## ------------- ----------- -------- -------- ----------
## training 0.8462 0.9167 0.8800 0.4321
## calibration 0.5000 1.0000 0.6667 1.1562
## test 0.4000 0.3333 0.3636 0.2092
From the parameters, it appears that the two models have nearly
identical performance, especially on the test set where the first three
metrics are the same. However, the decision tree has slightly lower
bias, indicating relatively better model performance.
Finally, let’s draw the ROC curves for comparison. We’ll choose the
two best-performing univariate models and the two multivariate models to
plot on the test set:
plot_roc <- function(predcol1, outcol1, predcol2, outcol2, predcol3, outcol3, predcol4, outcol4){
roc_1 <- rocit(score=predcol1, class=outcol1==pos)
roc_2 <- rocit(score=predcol2, class=outcol2==pos)
roc_3 <- rocit(score=predcol3, class=outcol3==pos)
roc_4 <- rocit(score=predcol4, class=outcol4==pos)
plot(roc_1, col = "blue", lwd = 2, legend = FALSE, YIndex = FALSE, values = TRUE, cex = 3)
lines(roc_2$TPR ~ roc_2$FPR, col = "red", lwd = 2, cex = 3)
lines(roc_3$TPR ~ roc_3$FPR, col = "green", lwd = 2, cex = 3)
lines(roc_4$TPR ~ roc_4$FPR, col = "orange", lwd = 2, cex = 3)
legend("bottomright", col = c("blue", "red", "green", "orange"),
c("PopSinModel", "GroSinModel", "DtreeModel", "NBModel"), lwd = 2, cex = 1)
}
pred_Pop_roc <- dTest$predPopulation
pred_Gro_roc <- dTest$predGross.tertiary.education.enrollment....
pred_tree_roc <- pred_test_new
pred_nb_roc <- pred_test_roc
plot_roc(pred_Pop_roc, dTest[,outcome], pred_Gro_roc, dTest[,outcome], pred_tree_roc, dTest[,outcome], pred_nb_roc, dTest[,outcome])

From this graph, we can analyze that there are a total of five models
(including the null model), and they show relatively similar performance
based on the AUC. Interestingly, the univariate model of the education
rate and the decision tree model exhibit similar trends when the false
positive rate (FPR) is greater than 0.4. There might be some underlying
correlation here, which could be a topic for further investigation in
this project.
In summary, overall, the two multivariate models didn’t outperform
the top-performing univariate models. This may be due to the limited
dataset size and a small number of variables, making univariate models
more suitable in this scenario. The two multivariate models show similar
performance and have their respective strengths, but they both exhibit
overfitting issues. In terms of variables, the total population and
education rate of a country have a significant impact on the
unemployment rate prediction, while the urban population rate has less
influence.
Part 2 - Clustering
Problem Analysis and Data Processing
In the problem analysis and data processing phase, we have a cleaned
data frame, ‘df,’ which contains 48 countries and 4 observational
metrics, including Population, Gross tertiary education enrollment,
Unemployment rate, and Urban population. For the clustering problem, we
also want to include longitude and latitude as reference metrics. In the
previous analysis, it was observed that there is an equal number of
missing values for longitude, latitude, and unemployment rate in the
‘youtubedata’ data frame, leading to the suspicion that the missing rows
are the same. Let’s conduct a check to verify this:
count_suspected_na_rows <- sum(rowSums(is.na(youtubedataT2[, c("Population", "Unemployment.rate", "Urban_population", "Latitude", "Longitude")])) == 5)
print(count_suspected_na_rows)
## [1] 123
Indeed, it appears that the missing rows in the ‘youtubedata’ are the
same as those in the ‘df’ data frame. So, let’s start by extracting the
columns from the ‘youtubedata’ that contain country names and
longitude/latitude. We will then join these columns with the ‘df’ data
frame based on the country names.
country_lon_lat <- youtubedataT2 %>%
select(Country, Latitude, Longitude) %>%
filter(!is.na(Latitude))
country_lon_lat <- unique(country_lon_lat)
# Using left_join function
df <- left_join(df, country_lon_lat, by = "Country")
Let’s take a look at the new data frame.
str(df)
## 'data.frame': 48 obs. of 7 variables:
## $ Country : chr "India" "United States" "Japan" "Russia" ...
## $ Population : num 1.37e+09 3.28e+08 1.26e+08 1.44e+08 5.17e+07 ...
## $ Gross.tertiary.education.enrollment....: num 28.1 88.2 63.2 81.9 94.3 60 68.9 51.3 90 88.5 ...
## $ Unemployment.rate : num 5.36 14.7 2.29 4.59 4.15 ...
## $ Urban_population : num 4.71e+08 2.71e+08 1.16e+08 1.08e+08 4.21e+07 ...
## $ Latitude : num 20.6 37.1 36.2 61.5 35.9 ...
## $ Longitude : num 79 -95.7 138.3 105.3 127.8 ...
summary(df)
## Country Population Gross.tertiary.education.enrollment....
## Length:48 Min. :2.025e+05 Min. : 7.60
## Class :character 1st Qu.:1.583e+07 1st Qu.:35.80
## Mode :character Median :4.185e+07 Median :57.45
## Mean :1.200e+08 Mean :55.17
## 3rd Qu.:9.744e+07 3rd Qu.:72.85
## Max. :1.398e+09 Max. :94.30
## Unemployment.rate Urban_population Latitude Longitude
## Min. : 0.750 Min. : 35588 Min. :-38.42 Min. :-172.10
## 1st Qu.: 3.743 1st Qu.: 9651217 1st Qu.: 11.27 1st Qu.: -60.56
## Median : 5.315 Median : 30732090 Median : 28.07 Median : 25.18
## Mean : 6.507 Mean : 69730921 Mean : 24.23 Mean : 16.65
## 3rd Qu.: 9.193 3rd Qu.: 57178091 3rd Qu.: 40.82 3rd Qu.: 81.81
## Max. :14.720 Max. :842933962 Max. : 61.92 Max. : 138.25
There are no missing values, and the ranges of all numerical values
are reasonable.
To mitigate the impact of distance due to unit discrepancies, we will
first preprocess the data. We’ll use the scale() function in R to
transform all columns, setting their means to 0 and standard deviations
to 1.
We’ll extract all columns except the first one, which represents the
countries, and apply the scale() function. Then, we’ll use the head()
function to view the updated data frame.
vars.to.use <- colnames(df)[-1]
scaled_df <- scale(df[,vars.to.use])
head(scaled_df)
## Population Gross.tertiary.education.enrollment.... Unemployment.rate
## [1,] 4.52338317 -1.0932714 -0.3007812
## [2,] 0.75558519 1.3342759 2.1487417
## [3,] 0.02243169 0.3244808 -1.1059242
## [4,] 0.08829138 1.0798075 -0.5027226
## [5,] -0.24801010 1.5806659 -0.6181177
## [6,] -0.19311673 0.1952270 -0.6967962
## Urban_population Latitude Longitude
## [1,] 2.88159445 -0.1452756 0.8016324
## [2,] 1.44282075 0.5138341 -1.4456142
## [3,] 0.33067912 0.4784579 1.5644134
## [4,] 0.27252653 1.4900703 1.1407072
## [5,] -0.19835940 0.4665887 1.4295084
## [6,] -0.09925512 1.2445123 -0.2584490
Using hierarchical cluster analysis.
We will begin by performing hierarchical clustering analysis using
the Euclidean distance and the “ward.D2” linkage method. To find the
best model, we can explore various options by changing these two
parameters.
For the dist() function, we can consider the following methods for
distance calculation: “euclidean,” “manhattan,” “maximum,” “canberra,”
“binary,” and “minkowski.”
For the hclust() function, we can explore various linkage methods,
including “complete,” “single,” “average,” “ward.D,” “ward.D2,” and
“centroid.”
Let’s proceed with hierarchical clustering using the Euclidean
distance and “ward.D2” linkage method. If you wish to explore different
combinations, please specify your preferences.
d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")
To visualize the hierarchical clustering, we will create a
dendrogram. We will proceed with ‘k = 5’ clusters for now. We can
explore other values of ‘k’ later if needed.
Let’s create the dendrogram with ‘k = 5’ clusters:
plot(pfit, labels=df$Country, main="Cluster Dendrogram for Country", cex = 0.7)
rect.hclust(pfit, k=5)

To gain a clearer understanding of the characteristics of each
group’s members, we’ll use the cutree() function to extract the group
assignments for each member.
groups <- cutree(pfit, k=5)
Let’s create a function to print the feature values for each
group:
print_clusters <- function(df, groups, cols_to_print) {
Ngroups <- max(groups)
for (i in 1:Ngroups) {
print(paste("cluster", i))
print(df[groups == i, cols_to_print])
}
}
Call the function and select the features you want to analyze:
cols_to_print <- c("Country", "Population", "Gross.tertiary.education.enrollment....", "Unemployment.rate", "Urban_population", "Latitude", "Longitude")
print_clusters(df, groups, cols_to_print)
## [1] "cluster 1"
## Country Population Gross.tertiary.education.enrollment.... Unemployment.rate
## 1 India 1366417754 28.1 5.36
## 40 China 1397715000 50.6 4.32
## Urban_population Latitude Longitude
## 1 471031528 20.59368 78.96288
## 40 842933962 35.86166 104.19540
## [1] "cluster 2"
## Country Population Gross.tertiary.education.enrollment....
## 2 United States 328239523 88.2
## 8 Brazil 212559417 51.3
## 9 Argentina 44938712 90.0
## 10 Chile 18952038 88.5
## 11 Cuba 11333483 41.4
## 12 El Salvador 6453553 29.4
## 16 Colombia 50339443 55.3
## 17 Barbados 287025 65.4
## 18 Mexico 126014024 40.2
## 20 Spain 47076781 88.9
## 24 Venezuela 28515829 79.3
## 43 Ecuador 17373662 44.9
## 45 Peru 32510453 70.7
## 48 Samoa 202506 7.6
## Unemployment.rate Urban_population Latitude Longitude
## 2 14.70 270663028 37.090240 -95.71289
## 8 12.08 183241641 -14.235004 -51.92528
## 9 9.79 41339571 -38.416097 -63.61667
## 10 7.09 16610135 -35.675147 -71.54297
## 11 1.64 8739135 21.521757 -77.78117
## 12 4.11 4694702 13.794185 -88.89653
## 16 9.71 40827302 4.570868 -74.29733
## 17 10.33 89431 13.193887 -59.54320
## 18 3.42 102626859 23.634501 -102.55278
## 20 13.96 37927409 40.463667 -3.74922
## 24 8.80 25162368 6.423750 -66.58973
## 43 3.97 11116711 -1.831239 -78.18341
## 45 3.31 25390339 -9.189967 -75.01515
## 48 8.36 35588 -13.759029 -172.10463
## [1] "cluster 3"
## Country Population Gross.tertiary.education.enrollment....
## 3 Japan 126226568 63.2
## 4 Russia 144373535 81.9
## 5 South Korea 51709098 94.3
## 13 Pakistan 216565318 9.0
## 14 Philippines 108116615 35.5
## 15 Thailand 69625582 49.3
## 19 United Arab Emirates 9770529 36.8
## 22 Indonesia 270203917 36.3
## 25 Kuwait 4207083 54.4
## 28 Singapore 5703569 84.8
## 29 Australia 25766605 68.0
## 38 Vietnam 96462106 28.5
## 39 Malaysia 32447385 45.1
## 46 Bangladesh 167310838 20.6
## Unemployment.rate Urban_population Latitude Longitude
## 3 2.29 115782416 36.204824 138.25292
## 4 4.59 107683889 61.524010 105.31876
## 5 4.15 42106719 35.907757 127.76692
## 13 4.45 79927762 30.375321 69.34512
## 14 2.15 50975903 12.879721 121.77402
## 15 0.75 35294600 15.870032 100.99254
## 19 2.35 8479744 23.424076 53.84782
## 22 4.69 151509724 -0.789275 113.92133
## 25 2.18 4207083 29.311660 47.48177
## 28 4.11 5703569 1.352083 103.81984
## 29 5.27 21844756 -25.274398 133.77514
## 38 2.01 35332140 14.058324 108.27720
## 39 3.32 24475766 4.210484 101.97577
## 46 4.19 60987417 23.684994 90.35633
## [1] "cluster 4"
## Country Population Gross.tertiary.education.enrollment....
## 6 United Kingdom 66834405 60.0
## 7 Canada 36991981 68.9
## 21 Saudi Arabia 34268528 68.0
## 27 Netherlands 17332850 85.0
## 30 Italy 60297396 61.9
## 31 Germany 83132799 70.2
## 32 France 67059887 65.6
## 33 Sweden 10285453 67.0
## 35 Ukraine 44385155 82.7
## 36 Latvia 1912789 88.1
## 37 Switzerland 8574832 59.6
## 47 Finland 5520314 88.2
## Unemployment.rate Urban_population Latitude Longitude
## 6 3.85 55908316 55.37805 -3.435973
## 7 5.56 30628482 56.13037 -106.346771
## 21 5.93 28807838 23.88594 45.079162
## 27 3.20 15924729 52.13263 5.291266
## 30 9.89 42651966 41.87194 12.567380
## 31 3.04 64324835 51.16569 10.451526
## 32 8.43 54123364 46.22764 2.213749
## 33 6.48 9021165 60.12816 18.643501
## 35 8.88 30835699 48.37943 31.165580
## 36 6.52 1304943 56.87964 24.603189
## 37 4.58 6332428 46.81819 8.227512
## 47 6.59 4716888 61.92411 25.748151
## [1] "cluster 5"
## Country Population Gross.tertiary.education.enrollment....
## 23 Turkey 83429615 23.9
## 26 Jordan 10101694 34.4
## 34 Afghanistan 38041754 9.7
## 41 Iraq 39309783 16.2
## 42 Egypt 100388073 35.2
## 44 Morocco 36910560 35.9
## Unemployment.rate Urban_population Latitude Longitude
## 23 13.49 63097818 38.96375 35.24332
## 26 14.72 9213048 30.58516 36.23841
## 34 11.12 9797273 33.93911 67.70995
## 41 12.82 27783368 33.22319 43.67929
## 42 10.76 42895824 26.82055 30.80250
## 44 9.02 22975026 31.79170 -7.09262
A preliminary analysis of these clusters reveals the following:
Cluster 1: Includes India and China, both densely populated countries
with a significant urban population. These two countries are located in
East Asia.
Cluster 3: This cluster includes countries like Japan, Russia, South
Korea, Pakistan, and the Philippines. The unemployment rate is
relatively low in these countries. They are distributed across Asia and
the Russian region.
Cluster 4: This cluster comprises countries such as the United
Kingdom, Canada, Saudi Arabia, the Netherlands, Italy, and Germany. This
cluster has a high gross tertiary education enrollment rate. These
countries are primarily located in Europe and the Middle East.
Cluster 5: This cluster includes countries like Turkey, Jordan,
Afghanistan, Iraq, Egypt, and Morocco. The gross tertiary education
enrollment rate is relatively low, and the unemployment rate is high in
these countries. They are primarily located in the Middle East and North
Africa.
Cluster 2: The characteristics of this cluster are not very
distinct.
These findings provide an initial understanding of the clusters and
their distinguishing features. Further analysis and interpretation can
be performed based on these observations.
Performance Evaluation for Different Values of ‘k’
In order to find the optimal value for performance, we create a
function to calculate the CH index and WSS values of the model and
generate plots of k against these two parameters.
sqr_euDist <- function(x, y) {
sum((x - y) ^ 2)
}
# Function to calculate WSS of a cluster
wss <- function(clustermat) {
c0 <- colMeans(clustermat)
sum(apply( clustermat, 1, FUN=function(row) {sqr_euDist(row, c0)} ))
}
# Function to calculate the total WSS
wss_total <- function(scaled_df, labels) {
wss.sum <- 0
k <- length(unique(labels))
for (i in 1:k)
wss.sum <- wss.sum + wss(subset(scaled_df, labels == i))
wss.sum
}
# Function to calculate total sum of squared (TSS) distance of data points about the (global) mean
tss <- function(scaled_df) {
wss(scaled_df)
}
CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax)
wss.value[1] <- wss(scaled_df)
d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}
# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))
grid.arrange(fig1, fig2, nrow=1)

We aim for a higher CH index value and a lower WSS value. The hope is
that the rate at which the WSS decreases will slow down for k values
beyond the optimal number of clusters. Therefore, for the CH index, the
best performance is achieved when it reaches its maximum value, which in
the current situation is k = 8. As for the WSS, the “elbow” point on the
graph is the preferred choice, although there is no clear elbow in the
current scenario. So, we select k = 8 as the optimal k value.
plot(pfit, labels=df$Country, main="Cluster Dendrogram for Country")
rect.hclust(pfit, k=8)

Stability Assessment
After selecting the optimal k value, we conduct a stability
assessment of the algorithm using clusterboot().
kbest.p <- 8
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="ward.D2", k=kbest.p)
Review the execution results:
summary(cboot.hclust$result)
## Length Class Mode
## result 7 hclust list
## noise 1 -none- logical
## nc 1 -none- numeric
## clusterlist 8 -none- list
## partition 48 -none- numeric
## clustermethod 1 -none- character
## nccl 1 -none- numeric
And we alse counted how many times each cluster was dissolved:
# cboot.hclust$bootbrd = number of times a cluster is desolved.
(values <- 1 - cboot.hclust$bootbrd/100)
## [1] 0.89 0.42 0.57 0.95 0.77 0.88 0.93 0.96
# larger values here means higher stablility
cat("So clusters", order(values)[5], "and", order(values)[4], "are highly stable")
## So clusters 1 and 6 are highly stable
(mean(values))
## [1] 0.79625
The model’s average values are 0.79625.
Method Selection
We can optimize the model and explore problems by changing the
algorithm’s Distance Method and Hierarchical Clustering Method.
Different Distance Methods are suitable for different problems, and we
can find the most suitable method for this problem through
experimentation. For example, selecting “manhattan”:
d <- dist(scaled_df, method = "manhattan")
pfit <- hclust(d, method = "ward.D2")
View the CH index and WSS values:
CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax)
wss.value[1] <- wss(scaled_df)
d <- dist(scaled_df, method = "manhattan")
pfit <- hclust(d, method = "ward.D2")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}
# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))
grid.arrange(fig1, fig2, nrow=1)

We will find that changing the distance method will affect the
optimal k value of the model. Therefore, under different methods, we
need to use different numbers of clusters for clustering. At this point,
the optimal k value is 6. Check for stability:
kbest.p <- 6
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="ward.D2", k=kbest.p)
Review the results:
(values <- 1 - cboot.hclust$bootbrd/100)
## [1] 0.93 0.56 0.98 0.95 0.76 0.91
(mean(values))
## [1] 0.8483333
The mean value is 0.848333. The stability has increased with the
“manhattan” model.
Now, let’s try different hierarchical clustering methods, such as
selecting “complete”:
d <- dist(scaled_df, method = "euclidean")
pfit <- hclust(d, method = "complete")
View CH index and WSS values:
CH_index <- function(scaled_df, kmax, method="kclust") {
npts <- nrow(scaled_df)
wss.value <- numeric(kmax)
wss.value[1] <- wss(scaled_df)
d <- dist(scaled_df, method = "euclidean")
pfit <- hclust(d, method = "complete")
for (k in 2:kmax) {
labels <- cutree(pfit, k=k)
wss.value[k] <- wss_total(scaled_df, labels)
}
bss.value <- tss(scaled_df) - wss.value
B <- bss.value / (0:(kmax-1))
W <- wss.value / (npts - 1:kmax)
data.frame(k = 1:kmax, CH_index = B/W, WSS = wss.value)
}
# calculate the CH criterion
crit.df <- CH_index(scaled_df, 10, method="hclust")
fig1 <- ggplot(crit.df, aes(x=k, y=CH_index)) +
geom_point() + geom_line(colour="red") +
scale_x_continuous(breaks=1:10, labels=1:10) +
labs(y="CH index") + theme(text=element_text(size=20))
fig2 <- ggplot(crit.df, aes(x=k, y=WSS), color="blue") +
geom_point() + geom_line(colour="blue") +
scale_x_continuous(breaks=1:10, labels=1:10) +
theme(text=element_text(size=20))
grid.arrange(fig1, fig2, nrow=1)

Select 7 as the optimal k value:
kbest.p <- 7
cboot.hclust <- clusterboot(scaled_df, clustermethod=hclustCBI,
method="complete", k=kbest.p)
(values <- 1 - cboot.hclust$bootbrd/100)
## [1] 0.90 0.49 0.93 0.67 0.71 0.88 0.90
(mean(values))
## [1] 0.7828571
The mean is 0.7828571. Under the same distance method, this model’s
stability is not as good as when using “ward.D2”.
Data Visualization
Finally, we proceed with data visualization for different values of
‘k’ by “euclidean” dist method and “ward.D2” hclust method.
d <- dist(scaled_df, method="euclidean")
pfit <- hclust(d, method="ward.D2")
We will use the ‘procomp()’ function to perform Principal Component
Analysis (PCA) and reduce the data to two dimensions for ease of
plotting.
princ <- prcomp(scaled_df)
# focus on the first two principal components
nComp <- 2
First, define a function to find the convex hull:
find_convex_hull <- function(proj2Ddf, groups) {
do.call(rbind,
lapply(unique(groups),
FUN = function(c) {
f <- subset(proj2Ddf, cluster==c);
f[chull(f),]
}
)
)
}
Using a loop, let’s create plots for different values of ‘k’:
fig <- c()
kvalues <- seq(5,8)
for (i in kvalues) {
groups <- cutree(pfit, k = i)
# project scaled_df onto the first 2 principal components to form a new 2-column data frame.
project2D <- as.data.frame(predict(princ, newdata=scaled_df)[,1:nComp])
# combine with `groups` and df$Country to form a 4-column data frame
hclust.project2D <- cbind(project2D, cluster=as.factor(groups), country=df$Country)
hclust.hull <- find_convex_hull(hclust.project2D, groups)
assign(paste0("fig", i),
ggplot(hclust.project2D, aes(x=PC1, y=PC2)) +
geom_point(aes(shape=cluster, color=cluster)) +
geom_polygon(data=hclust.hull, aes(group=cluster, fill=as.factor(cluster)),
alpha=0.4, linetype=0) +
scale_x_continuous(limits = c(-0.1, 0.07)) +
scale_y_continuous(limits = c(-0.3, 0.4)) +
labs(title = sprintf("k = %d", i)) +
theme(legend.position="none", text=element_text(size=10))
)
}
grid.arrange(fig5, fig6, fig7, fig8, ncol=2, heights = c(10, 10), widths = c(10, 10))

From this visualization, we can observe that regardless of the value
of ‘k,’ there is significant overlap between each cluster. This
indicates a high degree of similarity between different clusters, making
it challenging to distinguish them distinctly. This might be due to the
choice of features or distance metrics not effectively separating these
clusters, or it could be related to the data not being sufficiently
dispersed.